data source:
https://usafacts.org/visualizations/coronavirus-covid-19-spread-map/state/new-york
# df1 =
# read.csv(file = "https://static.usafacts.org/public/data/covid-19/covid_deaths_usafacts.csv?_ga=2.213522960.567718552.1605752195-1031006592.1605647663")
# population
pop_df =
read.csv(file = "./covid_county_population_usafacts.csv") %>%
janitor::clean_names() %>%
filter(state == "NY") %>%
select(-state)
# deaths
death_df =
read.csv(file = "./covid_deaths_usafacts.csv") %>%
janitor::clean_names() %>%
filter(state == "NY") %>%
pivot_longer(
cols = 4:305,
names_to = "test_date",
values_to = "cummulative_death"
) %>%
mutate(
test_date = str_remove(test_date, pattern = "x"),
test_date = str_replace(test_date, pattern = "_", "/"),
test_date = str_replace(test_date, pattern = "_", "/"),
test_date = str_replace(test_date, pattern = "20$","2020")
) %>%
select(-state)
# join dfs
death_pop =
left_join(pop_df, death_df, by = "county_fips") %>%
select(county_fips, county_name.x, population, test_date, cummulative_death) %>%
rename("county" = "county_name.x") %>%
group_by(test_date, county) %>%
relocate(test_date, county_fips, county) %>%
mutate(
county = str_remove(county, pattern = ".County")
) %>%
select(-county_fips)
data source:
ny_statewide_data =
read.csv("./New_York_State_Statewide_COVID-19_Testing.csv") %>%
janitor::clean_names()
# join death_pop data into test data
ny_state_test_pop_mor =
left_join(ny_statewide_data, death_pop, by = c("test_date","county")) %>%
mutate(
mortality_rate = cummulative_death / cumulative_number_of_positives,
test_date = as.Date(test_date, "%m/%d/%y"),
month = months(test_date),
year = format(test_date, format = "%y"),
)
# visualize daily fatality rate
daily_fatal_plot1 =
ny_state_test_pop_mor %>%
ggplot(aes(test_date, mortality_rate, group = county)) +
geom_line(stat = "identity", aes(color = county)) +
theme_light() +
scale_x_date(name = "date", date_breaks = "1 month")
daily_fatal_plot1 =
daily_fatal_plot1 %>%
ggplotly()
## Warning: `group_by_()` is deprecated as of dplyr 0.7.0.
## Please use `group_by()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
daily_fatal_plot1
# aggregate cummulative variables by taking maximum() for monthly data
ny_month_df1 =
aggregate(
cbind(cumulative_number_of_positives,
cummulative_death,
cumulative_number_of_tests_performed)
~ month + county,
ny_state_test_pop_mor,
max) %>%
mutate(
month_fatal = cummulative_death / cumulative_number_of_positives
)
# aggregate incremental variables by taking sum() for montly data
ny_month_df2 =
aggregate(
cbind(new_positives,
total_number_of_tests_performed,
population)
~month + county,
ny_state_test_pop_mor,
sum)
# combine 2 dfs, complete monthly data for fatality rate
ny_state_monthly_testpopmor =
left_join(ny_month_df1, ny_month_df2, by = c("month", "county"))
# manipulation for variables
# visualization monthly fatality rate in each state, by each month
monthly_fatal_plot1 =
ny_state_monthly_testpopmor %>%
ggplot(aes(month, month_fatal, group = county)) +
geom_line(aes(color = county))+
theme_light()
monthly_fatal_plot1 =
monthly_fatal_plot1 %>%
ggplotly()
monthly_fatal_plot1
write_csv(ny_state_test_pop_mor, path = "./ny_state_test_pop_mor.csv")
write_csv(ny_state_monthly_testpopmor, path = "./ny_state_bymonth_testpopmor.csv")